/* 02A_NAC_NEWVARS.do             KTS/DCC/NLB              yyyy-mm-dd:2026-02-04
----|----1----|----2----|----3----|----4----|----5----|----6----|----7----|----8

This do file creates the data used in the paper "Estimating Intergenerational
Returns to Medical Care: New Evidence from At-Risk Newborns" written by
Damian Clarke, Nicolas Lillo Bustos and Kathya Tapia-Schythe.
In certain cases these results will require the user-written commands
labutil and personage.
*/

* Start from a clean session, switch off output paging, and start timer 1:
clear all
set more off
timer on 1

*-------------------------------------------------------------------------------
*--- 1. Generate mother birth data (needs pre-processed NAC data only)
*-------------------------------------------------------------------------------

* Open the complete birth records (duplicates and NAs still included):
use "${nac_original}_NOGLOSAS.dta", clear

* Store the smallest and largest value of the birth-year variable:
summarize $byear_var
scalar min_ano_nac = r(min)
scalar max_ano_nac = r(max)

* Build the list of distinct, known mother IDs and store it under the
* newborn ID name, so that it can be matched against newborn records:
keep ID_MADRE
drop if inlist(ID_MADRE, "NA", "")
duplicates drop ID_MADRE, force
rename ID_MADRE ID_RECIEN_NACIDO
label variable ID_RECIEN_NACIDO "" // clear the inherited variable label
save "ID_RECIEN_NACIDO_of_unique_ID_MADRE.dta", replace

* Open the birth records that exclude duplicated and NA identifiers:
use "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", clear

* Keep only the newborns whose ID appears in the list of unique mother IDs:
merge 1:1 ID_RECIEN_NACIDO using "ID_RECIEN_NACIDO_of_unique_ID_MADRE.dta", keep(match) nogen

* Move the parent variables one generation up. Each triple holds the old
* suffix, the new suffix, and the prefix placed in front of the old label.
* Fathers are handled before mothers, as the order of renames matters:
foreach pair in "PADRE ABUELO Abuelo" "MADRE ABUELA Abuela" {
	tokenize `pair'
	foreach old of varlist *_`1' {
		local oldlbl : variable label `old'
		local new = subinstr("`old'", "_`1'", "_`2'", .)
		rename `old' `new'
		label variable `new' "`3': `oldlbl'"
	}
}

* The geographic variables carry no parent suffix; attach the grandmother
* suffix and label prefix to them explicitly:
foreach geo in COMUNA_RESIDENCIA REGION_RESIDENCIA SERV_RES URBANO_RURAL {
	local geolbl : variable label `geo'
	rename `geo' `geo'_ABUELA
	label variable `geo'_ABUELA "Abuela: `geolbl'"
}

* Every remaining variable (other than the ID) describes the person whose
* birth record this is, i.e. the mother; suffix and relabel accordingly:
ds *_ABUELO *_ABUELA ID_RECIEN_NACIDO, not
local own_vars `r(varlist)'
foreach v of local own_vars {
	local vlbl : variable label `v'
	rename `v' `v'_MADRE
	label variable `v'_MADRE "Madre: `vlbl'"
}

* The newborn ID of this record now plays the role of the mother ID:
rename ID_RECIEN_NACIDO ID_MADRE
label variable ID_MADRE "Identificador único y anónimo de la madre del recién nacido vivo"

* Compress, label, sign, and save the mother birth data:
compress
label data "Mother birth data (based on DEIS's Nacimientos `=min_ano_nac'-`=max_ano_nac')"
notes drop _dta
note: Last modified by: $id_user_full ($id_user_email)
note: Last modification timestamp: $S_DATE at $S_TIME
save "MOTHER_BIRTHDATA.dta", replace

*-------------------------------------------------------------------------------
*--- 2. Generate fertility variables (needs pre-processed NAC and DEF data, and MOTHER_BIRTHDATA.dta)
*-------------------------------------------------------------------------------

* Purpose of this block: count, for each mother and each age of the mother,
* the children/births recorded at that age, and then attach every woman found
* in the birth records so that women without recorded births are kept too.

* Load data (birth records without duplicated or NA newborn IDs, per file name):
use "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", clear

* Calculate important scalars: smallest and largest birth year in the data, and
* their difference, which is used later as the upper bound for mother ages:
sum $byear_var
scalar min_ano_nac = r(min)
scalar max_ano_nac = r(max)
scalar max_mother_age = max_ano_nac - min_ano_nac

* Drop observations with unknown mother id:
drop if ID_MADRE == "NA" | ID_MADRE == ""

* Sort:
sort ID_MADRE FECHA_NACIMIENTO_SIF

* Get birth order:
* NOTE(review): the by-group includes FECHA_NACIMIENTO_SIF, so the ranks restart
* within every mother/birth-date pair: with unique, same-date births are
* numbered 1..k; with track, they all receive 1. Other sections of this file
* rank by ID_MADRE only; confirm that the per-date grouping is intended here.
* The if-qualifier is redundant, since "NA" mothers were dropped just above.
by ID_MADRE FECHA_NACIMIENTO_SIF: egen birth_order_by_mother_date_u = rank(FECHA_NACIMIENTO_SIF) if ID_MADRE != "NA", unique
by ID_MADRE FECHA_NACIMIENTO_SIF: egen birth_order_by_mother_date_t = rank(FECHA_NACIMIENTO_SIF) if ID_MADRE != "NA", track

* Keep only births of mothers for whom we have birth data:
merge m:1 ID_MADRE using MOTHER_BIRTHDATA.dta, nogen keepusing(FECHA_NACIMIENTO_SIF_MADRE) keep(match)

* Recalculate age using mother's birth date:
* (personage is a user-written command; presumably it returns the mother's age
* in completed years at the child's birth date; confirm against its help file)
personage FECHA_NACIMIENTO_SIF_MADRE FECHA_NACIMIENTO_SIF, g(EDAD_MADRE_2)

* Obtain number of children and number of births by mother id and age of the mother:
* (one row per mother/age pair, holding the maximum of each rank in that pair)
collapse 	(max) nchilds_at_ = birth_order_by_mother_date_u ///
			(max) nbirths_at_ = birth_order_by_mother_date_t, ///
			by(ID_MADRE EDAD_MADRE_2)

* Merge in rest of women in the database:
* (the mother ID is matched against newborn IDs; keep(match using) also keeps
* people who never appear as a mother, with missing age and counts)
gen ID_RECIEN_NACIDO = ID_MADRE
drop ID_MADRE
merge m:1 ID_RECIEN_NACIDO ///
	using "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", ///
	gen(mrg_NAC) keepusing(FECHA_NACIMIENTO_SIF SEXO) keep(match using)

* Keep one sex only (SEXO == 2 is presumably female; confirm with the codebook):
keep if SEXO == 2
drop SEXO

* Purpose of this block: turn the sparse woman/age rows into a rectangular
* woman-by-age panel, compute counts accumulated before each age, and drop
* ages that cannot be observed in the data.

timer on 2
* Fillin missing ID/Age combinations:
* (fillin adds a row for every ID/age pair not present and flags it _fillin == 1)
fillin ID_RECIEN_NACIDO EDAD_MADRE_2
timer off 2

* Assign birthdate to filled in obs:
* (the woman's own birth date is taken from her existing rows)
by ID_RECIEN_NACIDO: egen aux = min(FECHA_NACIMIENTO_SIF)
replace FECHA_NACIMIENTO_SIF = aux if FECHA_NACIMIENTO_SIF == . & _fillin == 1

* Replace filled with missing with 0s:
replace nchilds_at_ = 0 if nchilds_at_ == . & _fillin == 1
replace nbirths_at_ = 0 if nbirths_at_ == . & _fillin == 1

* Calculate running sum of children and births by mother and age:
* (subtracting the current value leaves the total accumulated strictly
* before the current age)
sort ID_RECIEN_NACIDO EDAD_MADRE_2
by ID_RECIEN_NACIDO: gen nchilds_by_ = sum(nchilds_at_) - nchilds_at_
by ID_RECIEN_NACIDO: gen nbirths_by_ = sum(nbirths_at_) - nbirths_at_

* Drop ages below 15 and over the maximum theoretical age:
* (rows with missing age, i.e. women without any recorded birth, go here too;
* their filled-in rows for the other ages remain)
drop if EDAD_MADRE_2 < 15 | EDAD_MADRE_2 > max_mother_age | EDAD_MADRE_2 == .

* Compress
compress

* Find age at end of database:
* (reference date is 31 December of the last birth year; personage is a
* user-written command, presumably returning completed years; confirm)
personage FECHA_NACIMIENTO_SIF, currdate(`=mdy(12, 31, `=max_ano_nac')') g(age_at_end_of_dbase)

* Drop ID/Age combinations not feasible by end of database:
drop if EDAD_MADRE_2 > age_at_end_of_dbase

* Merge in death date:
* The deaths file is addressed through the def_original global, the same way
* the mortality section of this file does, instead of a hard-coded file name
* that would silently go stale when the deaths extract is updated.
* NOTE(review): assumes the def_original global resolves to the same file the
* hard-coded name pointed to; confirm in the master do-file.
gen ID_FALLECIDO = ID_RECIEN_NACIDO
merge m:1 ID_FALLECIDO using "${def_original}_NOGLOSAS_NODUPS_NONAS.dta", gen(mrg_DEF) keep(master match) keepusing(FECHA_DEF_SIF)

* Find age at death:
* (personage is a user-written command; presumably completed years; confirm)
personage FECHA_NACIMIENTO_SIF FECHA_DEF_SIF if mrg_DEF == 3, g(age_at_death)

* A negative age at death can only come from inconsistent dates. Treat it as
* unknown, as the mortality section does; otherwise every age row of that
* woman would be dropped below and she would vanish from the output file:
replace age_at_death = . if age_at_death < 0

* Drop ID/Age combinations after death of a person:
* (a missing age at death compares as larger than any age, so nothing is
* dropped for women with unknown age at death)
drop if EDAD_MADRE_2 > age_at_death & mrg_DEF == 3

* Reshape wide (timed with timer 3):
timer on 3

* Retain just the identifier, the age index, and the four fertility counts:
keep ID_RECIEN_NACIDO EDAD_MADRE_2 nchilds_at_ nbirths_at_ nchilds_by_ nbirths_by_

* One row per woman, one column per count/age pair:
reshape wide nchilds_at_ nbirths_at_ nchilds_by_ nbirths_by_, i(ID_RECIEN_NACIDO) j(EDAD_MADRE_2) 
timer off 3

* Label every count/age column. The age is read from the two trailing
* characters of the variable name, which the ?? wildcard guarantees exist:
foreach unit in childs births {
	local what = cond("`unit'" == "childs", "children born", "births")
	foreach when in at by {
		foreach v of varlist n`unit'_`when'_?? {
			local age = real(substr("`v'", -2, 2))
			label variable `v' "Number of `what' `when' age `age'"
		}
	}
}


* Compress, label, sign, and save the fertility variables:
compress
label data "Fertility variables for ages 15 to `=max_mother_age' for women born `=min_ano_nac'-`=max_ano_nac'"
notes drop _dta
note: Last modified by: $id_user_full ($id_user_email)
note: Last modification timestamp: $S_DATE at $S_TIME
save "FERTILITY_VARIABLES.dta", replace

*-------------------------------------------------------------------------------
*--- 3. Generate birth spacing (needs pre-processed NAC data only)
*-------------------------------------------------------------------------------

* Purpose of this block: for every birth with a known mother, find the week of
* the mother's next conception and of her previous birth, and from them the
* forward and backward birth spacing in weeks.

* Load data (birth records without duplicated or NA newborn IDs, per file name):
use "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", clear

* Calculate important scalars:
* (max_mother_age is computed for symmetry with other sections but is not
* used again in this section)
sum $byear_var
scalar min_ano_nac = r(min)
scalar max_ano_nac = r(max)
scalar max_mother_age = max_ano_nac - min_ano_nac

* Keep only relevant variables:
keep ID_RECIEN_NACIDO ID_MADRE FECHA_NACIMIENTO_SIF SEMANAS

* Drop observations with unknown mother id:
drop if ID_MADRE == "NA" | ID_MADRE == ""

* Sort:
sort ID_MADRE FECHA_NACIMIENTO_SIF

* Get birth order:
* (unique breaks same-date ties into distinct ranks, so it counts children;
* track gives tied births the same rank, so it counts delivery events)
by ID_MADRE: egen birth_order_by_mother_u = rank(FECHA_NACIMIENTO_SIF), unique
by ID_MADRE: egen birth_order_by_mother_t = rank(FECHA_NACIMIENTO_SIF), track

* Tag first borns and last borns:
* (stores the per-mother maximum of each rank)
by ID_MADRE: egen nchilds_by_mother = max(birth_order_by_mother_u)
by ID_MADRE: egen nbirths_by_mother = max(birth_order_by_mother_t)

* Create conception date (week):
* (birth week minus SEMANAS; SEMANAS is presumably gestational weeks; confirm)
gen bdatew = wofd(FECHA_NACIMIENTO_SIF)
gen cdatew = bdatew - SEMANAS

* Get next conception date (week) after current birth date (week):
* (looks at the following row of the same mother; the value is then spread
* over all rows sharing the same track rank, so that same-date siblings agree)
by ID_MADRE: gen next_cdatew = cdatew[_n + 1] if bdatew < cdatew[_n + 1]
bys ID_MADRE birth_order_by_mother_t: egen min_next_cdatew = min(next_cdatew)
drop next_cdatew
rename min_next_cdatew next_cdatew

* Get previous birth date (week) prior to current conception date (week):
* (looks at the preceding row of the same mother, then spreads the value over
* rows sharing the same track rank, as above)
by ID_MADRE: gen prev_bdatew = bdatew[_n - 1] if cdatew > bdatew[_n - 1]
bys ID_MADRE birth_order_by_mother_t: egen min_prev_bdatew = min(prev_bdatew)
drop prev_bdatew
rename min_prev_bdatew prev_bdatew

* Format weekly variables:
format *datew %tw

* Generate a dummy variable for whether or not a mother has another child:
gen another_birth = nbirths_by_mother > birth_order_by_mother_t

* Calculate forward (f) and backwards (b) birthspacing:
gen bspcngf = next_cdatew - bdatew
gen bspcngb = cdatew - prev_bdatew

* Calculate alternative definition for forward (f) birthspacing:
* births without a later birth get the number of weeks from the birth week to
* the first week of the year after the last birth year in the data; all other
* births keep the regular forward spacing.
gen bspcngf2 = yw(max_ano_nac + 1, 1) - bdatew if another_birth == 0
replace bspcngf2 = bspcngf if another_birth == 1
note bspcngf2: same as bspcngf but inputing the weeks until end of `=max_ano_nac' for births without followup births.

* Label newly created variables:
* Stata variable labels hold at most 80 characters, so each label below is
* kept within that limit.
label var nchilds_by_mother "Total number of children by ID_MADRE"
label var nbirths_by_mother "Total number of births by ID_MADRE"
label var bdatew "Birth date week"
label var cdatew "Conception date week"
label var next_cdatew "Next conception date week by ID_MADRE"
label var prev_bdatew "Previous birthdate week by ID_MADRE"
label var another_birth "Mother has another birth after this one"
label var bspcngf "Forward birth spacing: weeks until next conception from current birth week"
* bspcngb is measured from the mother's previous birth, not from a birthday:
label var bspcngb "Backward birth spacing: weeks from previous birth to current conception week"
* "(censored)" is placed early so that it cannot be cut off by the 80-character
* limit; the earlier wording was 85 characters long and lost its suffix:
label var bspcngf2 "Forward birth spacing (censored): weeks until next conception from birth week"

* Compress, label, sign, and save the birth spacing data:
compress
label data "Birth spacing variables"
notes drop _dta
note: Last modified by: $id_user_full ($id_user_email)
note: Last modification timestamp: $S_DATE at $S_TIME
save "BIRTH_SPACING.dta", replace

*-------------------------------------------------------------------------------
*--- 4. Generate Small for Gestation Age (SGA) variables (needs pre-processed NAC data only)
*-------------------------------------------------------------------------------

* Purpose of this block: flag newborns whose weight (sgabw) or length (sgasz)
* lies more than two standard deviations below the mean of their
* sex / birth year / gestational week cell.

* Load data (birth records without duplicated or NA newborn IDs, per file name):
use "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", clear

* Calculate important scalars:
sum $byear_var
scalar min_ano_nac = r(min)
scalar max_ano_nac = r(max)
scalar max_mother_age = max_ano_nac - min_ano_nac

* Keep relevant variables:
keep ID_RECIEN_NACIDO ANO_NAC SEMANAS PESO TALLA SEXO

* Calculate statistics by sex/year/weeks of gestation, for weight and length:
foreach m in PESO TALLA {
	bys SEXO ANO_NAC SEMANAS: egen count_`m' = count(`m')
	bys SEXO ANO_NAC SEMANAS: egen mean_`m' = mean(`m')
	bys SEXO ANO_NAC SEMANAS: egen sd_`m' = sd(`m')
}

* Define dummy variable for SGA children based on weight:
* (left missing when any input is missing or the cell has fewer than 30 births)
gen sgabw = PESO < mean_PESO - 2 * sd_PESO if PESO != . & mean_PESO != . & sd_PESO != . & count_PESO >= 30 & count_PESO != .
label var sgabw "Small for Gestational Age (Birth Weight < Mean - 2 * Std. Dev.)"
note sgabw: "SGA is defined as a birth weight and/or birth length greater than 2 standard deviations (SD) below the population reference mean for gestational age." (Boguszewski et al. BMC Pediatrics 2011, 11:66)
note sgabw: Set to missing if number of births in SEXO/ANO_NAC/SEMANAS bin is less than 30.

* Define dummy variable for SGA children based on height:
* (the cell-size guard refers to the TALLA count; it previously tested the
* PESO count for missingness, a copy-paste slip from the weight definition)
gen sgasz = TALLA < mean_TALLA - 2 * sd_TALLA if TALLA != . & mean_TALLA != . & sd_TALLA != . & count_TALLA >= 30 & count_TALLA != .
label var sgasz "Small for Gestational Age (Birth Size < Mean - 2 * Std. Dev.)"
note sgasz: "SGA is defined as a birth weight and/or birth length greater than 2 standard deviations (SD) below the population reference mean for gestational age." (Boguszewski et al. BMC Pediatrics 2011, 11:66)
note sgasz: Set to missing if number of births in SEXO/ANO_NAC/SEMANAS bin is less than 30.

* Drop unnecessary variables:
drop count_* mean_* sd_*

* Compress, label, sign, and save the SGA variables:
compress
label data "SGA variables"
notes drop _dta
note: Last modified by: $id_user_full ($id_user_email)
note: Last modification timestamp: $S_DATE at $S_TIME
save "SGA_VARIABLES.dta", replace

*-------------------------------------------------------------------------------
*--- 5. Generate labour market activity variables (needs pre-processed NAC data only)
*-------------------------------------------------------------------------------

* Purpose of this block: load the birth records, rank each mother's births,
* and code binary labour market status indicators for both parents.

* Load data (birth records without duplicated or NA newborn IDs, per file name):
use "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", clear

* Define list of problematic years:
local probyears 1992 2000 2007 2015

* Keep only relevant variables:
keep ID_RECIEN_NACIDO ID_MADRE FECHA_NACIMIENTO_SIF ANO_NAC ACTIV_?ADRE

* Identify problematic years:
* (the space-separated list is turned into a comma-separated inlist argument)
gen byte problematic_year = inlist(ANO_NAC, `=subinstr("`probyears'", " ", ", ", .)')
label var problematic_year "Issues with the income variable for this year"

* Sort:
sort ID_MADRE FECHA_NACIMIENTO_SIF

* Get birth order:
* Unknown mothers are left without a birth order. Both "NA" and the empty
* string count as unknown, matching how the other sections of this file
* treat ID_MADRE; otherwise all blank-ID births would be ranked as siblings.
by ID_MADRE: egen byte birth_order_by_mother_u = rank(FECHA_NACIMIENTO_SIF) if !inlist(ID_MADRE, "NA", ""), unique
by ID_MADRE: egen byte birth_order_by_mother_t = rank(FECHA_NACIMIENTO_SIF) if !inlist(ID_MADRE, "NA", ""), track

* Number of children born so far:
bys ID_MADRE birth_order_by_mother_t: egen byte nchilds_by_mother_t = max(birth_order_by_mother_u) if !inlist(ID_MADRE, "NA", "")
label var nchilds_by_mother_t "Number of children born to the same mother so far"

* Codify parent's labour market status:
* (the raw codes are strings; codes that do not convert to a number of at most
* 2 leave the indicator missing, and problematic years are blanked as well)
gen activ_p = ACTIV_PADRE == "1" | ACTIV_PADRE == "2" if real(ACTIV_PADRE) <= 2
replace activ_p = . if problematic_year == 1
label var activ_p "Father's labour market status"

gen activ_m = ACTIV_MADRE == "1" | ACTIV_MADRE == "2" if real(ACTIV_MADRE) <= 2
replace activ_m = . if problematic_year == 1
label var activ_m "Mother's labour market status"

note activ_p: activ_p = 1 if ACTIV_PADRE == 1 | ACTIV_PADRE == 2
note activ_m: activ_m = 1 if ACTIV_MADRE == 1 | ACTIV_MADRE == 2

* Purpose of this block: spread each parent's status at birth number b over
* all rows of the same mother (one variable per birth number), and then pick,
* for every birth, the status recorded at the mother's following birth.

* Create variables of labour status for each birth:
* (blist holds every distinct track rank found in the data)
levelsof birth_order_by_mother_t, local(blist)
foreach b of local blist {
	* two-digit, zero-padded birth number used in the variable names
	local bb = string(`b', "%02.0f")
	* min() over a value that is missing except at birth b copies the status
	* at birth b onto every row of the mother
	by ID_MADRE: egen byte activ_p_birth`bb' = ///
		min(cond(birth_order_by_mother_t == `b', activ_p, .))
		
	label var activ_p_birth`bb' "Father's labour market status: birth `b'" 
		
	by ID_MADRE: egen byte activ_m_birth`bb' = ///
		min(cond(birth_order_by_mother_t == `b', activ_m, .))
		
	label var activ_m_birth`bb' "Mother's labour market status: birth `b'" 
}

* Create variables of labour status for NEXT birth:
gen next_activ_p = .
label var next_activ_p "Father's labour market status: next birth"
gen next_activ_m = .
label var next_activ_m "Mother's labour market status: next birth"
levelsof birth_order_by_mother_t, local(blist)
foreach b of local blist {
	* v is the following birth number, vv its zero-padded form
	local v = `b' + 1
	local vv = string(`v', "%02.0f")
	
	* the variable for birth b + 1 only exists if that rank occurs in the
	* data, so its existence is checked before it is used
	capture confirm variable activ_p_birth`vv', exact
	if _rc == 0 {
		replace next_activ_p = activ_p_birth`vv' if birth_order_by_mother_t == `b'
	}
	
	capture confirm variable activ_m_birth`vv', exact
	if _rc == 0 {
		replace next_activ_m = activ_m_birth`vv' if birth_order_by_mother_t == `b'
	}
	
}

* Attach the Inactive/Active value label to every activity indicator:
label define activ 0 "Inactive" 1 "Active"
label values *activ* activ

* Transition variables:
/*
gen joinlf_m = next_activ_m if activ_m == 0 & next_activ_m != .
gen leavlf_m = next_activ_m if activ_m == 1 & next_activ_m != .
gen joinlf_p = next_activ_p if activ_p == 0 & next_activ_p != .
gen leavlf_p = next_activ_p if activ_p == 1 & next_activ_p != .
*/

* For each parent (m = mother, p = father): a join indicator defined only for
* parents currently inactive, and a leave indicator defined only for parents
* currently active; both need a 0/1 status at the next birth.
foreach par in m p {
	local who = cond("`par'" == "m", "Mother", "Father")

	gen joinlf_`par' = next_activ_`par' == 1 if activ_`par' == 0 & inlist(next_activ_`par', 0, 1)
	label variable joinlf_`par' "`who' joins the labour force by next birth"

	gen leavlf_`par' = next_activ_`par' == 0 if activ_`par' == 1 & inlist(next_activ_`par', 0, 1)
	label variable leavlf_`par' "`who' leaves the labour force by next birth"
}

* Value labels for the transition indicators:
label define joinlf 0 "Remains out" 1 "Joined labour force"
label define leavlf 0 "Remains in" 1 "Left labour force"

label values joinlf_? joinlf
label values leavlf_? leavlf

* Compress, label, sign, and save the labour market variables:
compress
label data "Labour market participation variables"
notes drop _dta
note: Last modified by: $id_user_full ($id_user_email)
note: Last modification timestamp: $S_DATE at $S_TIME
save "ACTIVITY_VARIABLES.dta", replace

*-------------------------------------------------------------------------------
*--- 6. Generate education variables (needs MOTHER_BIRTHDATA.dta)
*-------------------------------------------------------------------------------

* Purpose of this block: recode the level/grade education variables of the
* mother, father, grandmother and grandfather into a 4-category scale, a
* completed-secondary dummy, and years of education.

* Open the birth records (no duplicated or NA newborn IDs, per file name):
use "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", clear

* Birth-year range scalars:
summarize $byear_var
scalar min_ano_nac = r(min)
scalar max_ano_nac = r(max)
scalar max_mother_age = max_ano_nac - min_ano_nac

* Only identifiers and education variables are needed:
keep ID_RECIEN_NACIDO ID_MADRE NIVEL_* CURSO_*

* Bring in the education variables stored in the mother's own birth record,
* then discard mother records that match no birth in the master data:
merge m:1 ID_MADRE using "MOTHER_BIRTHDATA.dta", ///
	gen(mrg_mbdata2main) keepusing(NIVEL_* CURSO_*)
drop if mrg_mbdata2main == 2
label variable mrg_mbdata2main "Merge m:1 ID_MADRE using MOTHER_BIRTHDATA.dta"

* Parallel lists: output suffix, source-variable suffix, and label wording:
local sfx m p aa ao
local rel MADRE PADRE ABUELA ABUELO
local who Mother Father Grandmother Grandfather

* Education in 3 levels plus none (NIVEL 1 -> 3, 2 or 3 -> 2, 4 -> 1, 5 -> 0):
forvalues i = 1/4 {
	local s : word `i' of `sfx'
	local R : word `i' of `rel'
	local W : word `i' of `who'

	gen ed3lvls_`s' = cond(NIVEL_`R' == 1, 3, ///
		cond(inlist(NIVEL_`R', 2, 3), 2, ///
		cond(NIVEL_`R' == 4, 1, ///
		cond(NIVEL_`R' == 5, 0, .))))
	label variable ed3lvls_`s' "`W''s education level (3 levels)"
}

egen highest_ed3lvls_mp = rowmax(ed3lvls_m ed3lvls_p)
label variable highest_ed3lvls_mp "Highest education level (3 levels): parents"
egen highest_ed3lvls_aaoo = rowmax(ed3lvls_aa ed3lvls_ao)
label variable highest_ed3lvls_aaoo "Highest education level (3 levels): grandparents"

label define ed3lvl_x 0 "None" 1 "Primary" 2 "Secondary" 3 "University"
label values *ed3lvls_* ed3lvl_x

* Education in 2 levels: 1 for NIVEL 1, or NIVEL 2/3 with CURSO of at least 4;
* 0 for NIVEL 2/3 with CURSO below 4, or NIVEL 4/5:
forvalues i = 1/4 {
	local s : word `i' of `sfx'
	local R : word `i' of `rel'
	local W : word `i' of `who'

	gen ed2lvls_`s' = 1 if NIVEL_`R' == 1
	replace ed2lvls_`s' = 1 if inlist(NIVEL_`R', 2, 3) & CURSO_`R' >= 4 & CURSO_`R' != .
	replace ed2lvls_`s' = 0 if inlist(NIVEL_`R', 2, 3) & CURSO_`R' < 4
	replace ed2lvls_`s' = 0 if inlist(NIVEL_`R', 4, 5)
	label variable ed2lvls_`s' "`W''s education level (2 levels)"
}

egen highest_ed2lvls_mp = rowmax(ed2lvls_m ed2lvls_p)
label variable highest_ed2lvls_mp "Any parent completed secondary school"
egen highest_ed2lvls_aaoo = rowmax(ed2lvls_aa ed2lvls_ao)
label variable highest_ed2lvls_aaoo "Any grandparent completed secondary school"

label define ed2lvl_x 0 "Secondary schooling not completed" 1 "Secondary schooling completed"
label values *ed2lvls_* ed2lvl_x

* Years of education. NIVEL 4/5: the grade itself, capped at 8; NIVEL 2/3:
* 8 plus the grade, capped at 12; NIVEL 1: 12 plus the grade. Variables are
* created in the order mother, grandmother, father, grandfather:
local yrs_var edmom edgmom edpop edgpop
local yrs_rel MADRE ABUELA PADRE ABUELO
local yrs_who Mother Grandmother Father Grandfather
forvalues i = 1/4 {
	local y : word `i' of `yrs_var'
	local R : word `i' of `yrs_rel'
	local W : word `i' of `yrs_who'

	gen `y' = CURSO_`R' if CURSO_`R' <= 8 & inlist(NIVEL_`R', 4, 5)
	replace `y' = 8 if CURSO_`R' == 9 & inlist(NIVEL_`R', 4, 5)
	replace `y' = 8 + CURSO_`R' if CURSO_`R' <= 4 & inlist(NIVEL_`R', 2, 3)
	replace `y' = 12 if CURSO_`R' >= 5 & CURSO_`R' <= 9 & inlist(NIVEL_`R', 2, 3)
	replace `y' = 12 + CURSO_`R' if NIVEL_`R' == 1
	label variable `y' "`W''s years of education"
	note `y': Constructed based on CURSO_`R' and NIVEL_`R' ($id_user_short on $S_DATE)

	* Quick checks printed to the log: distribution and number of missings
	summarize `y', detail
	count if `y' == .
}


* Compress, label, sign, and save the education variables:
compress
label data "Education variables"
notes drop _dta
note: Last modified by: $id_user_full ($id_user_email)
note: Last modification timestamp: $S_DATE at $S_TIME
save "EDUCATION_VARIABLES.dta", replace

*-------------------------------------------------------------------------------
*--- 7. Generate heaping variables (needs MOTHER_BIRTHDATA.dta)
*-------------------------------------------------------------------------------

* Open the birth records (no duplicated or NA newborn IDs, per file name):
use "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", clear


* Birth-year range scalars:
summarize $byear_var
scalar min_ano_nac = r(min)
scalar max_ano_nac = r(max)
scalar max_mother_age = max_ano_nac - min_ano_nac


* Attach the mother's own birth weight, then discard mother records that
* match no birth in the master data:
merge m:1 ID_MADRE using "MOTHER_BIRTHDATA.dta", ///
	gen(mrg_mbdata2main) keepusing(PESO_MADRE)
drop if mrg_mbdata2main == 2
label variable mrg_mbdata2main "Merge m:1 ID_MADRE using MOTHER_BIRTHDATA.dta"

* Tag obs. with a "round" birthweight (to control for heaping), once for
* multiples of 50 and once for multiples of 100, for the person and for the
* person's mother; the flag stays missing when the weight is missing:
foreach k in 50 100 {
	local kkk = string(`k', "%03.0f")
	gen round`kkk' = mod(PESO, `k') == 0 if mod(PESO, `k')!=.
	gen round`kkk'm = mod(PESO_MADRE, `k') == 0 if mod(PESO_MADRE, `k')!=.
	label variable round`kkk' "=1 if person's birthweight is a multiple of `k'"
	label variable round`kkk'm "=1 if person's mother's birthweight is a multiple of `k'"
	note round`kkk'm: Available only for people born after 2001
}

* Create heaping vars:
* one dummy per 50-gram step from 1200 up to the 5th percentile of PESO
* (rounded down to a multiple of 10), for the person and for the mother.
sum PESO, d
forval ww = 1200(50)`=floor(`r(p5)' / 10) * 10' {
	gen byte dbw`ww' = PESO == `ww' if PESO != .
	label var dbw`ww' "Heaping variable: Birth weight = `ww'"
	gen byte dbw`ww'm = PESO_MADRE == `ww' if PESO_MADRE != .
	label var dbw`ww'm "Heaping variable: Mother's Birth weight = `ww'"
	
	* Drop dummies that never vary. r(N) and r(sd) are read as expressions
	* rather than expanded as macros: with zero non-missing observations
	* summarize stores no r(sd), a macro would expand to nothing and the
	* condition would be a syntax error, whereas the expression is missing.
	quietly sum dbw`ww'
	if r(N) > 0 & r(sd) == 0 {
		drop dbw`ww'
	}
	
	quietly sum dbw`ww'm
	if r(N) > 0 & r(sd) == 0 {
		drop dbw`ww'm
	}
}

* Compress, label, sign, and save the heaping variables:
compress
label data "Heaping Variables"
notes drop _dta
note: Last modified by: $id_user_full ($id_user_email)
note: Last modification timestamp: $S_DATE at $S_TIME
save "HEAPING_VARIABLES.dta", replace

*-------------------------------------------------------------------------------
*--- 8. Generate low birthweight indicators (needs MOTHER_BIRTHDATA.dta)
*-------------------------------------------------------------------------------

* Purpose of this block: build low birth weight indicators for the newborn
* and for the newborn's mother at several gram thresholds.

* Load data (birth records without duplicated or NA newborn IDs, per file name):
use "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", clear


* Calculate important scalars:
sum $byear_var
scalar min_ano_nac = r(min)
scalar max_ano_nac = r(max)
scalar max_mother_age = max_ano_nac - min_ano_nac


* Keep only relevant variables:
keep ID_RECIEN_NACIDO ID_MADRE PESO

* Merge in mother's birth data:
merge m:1 ID_MADRE using "MOTHER_BIRTHDATA.dta", ///
	gen(mrg_mbdata2main) keepusing(PESO_MADRE)

* Drop mothers of duplicates:
drop if mrg_mbdata2main == 2

* label merge variable:
label var mrg_mbdata2main "Merge m:1 ID_MADRE using MOTHER_BIRTHDATA.dta"



* Create indicator variables for low birth weight:
* Stata treats a missing value as larger than any number, so a bare
* "weight < threshold" comparison yields 0 for unknown weights. Every
* indicator is therefore restricted to non-missing weights and stays missing
* otherwise; this matters most for PESO_MADRE, which is missing for every
* child whose mother has no birth record.
forval w = 1000(250)4000 {
	gen bw_below_`w' = PESO < `w' if !missing(PESO)
	label var bw_below_`w' "Child's birth weight below `w' grams"
	
	gen bw_below_`w'm = PESO_MADRE < `w' if !missing(PESO_MADRE)
	label var bw_below_`w'm "Mother's birth weight below `w' grams"
}

* Specific LBW and VLBW indicators (same missing-value rule as above):
gen vlbw = PESO < 1500 if !missing(PESO)
label var vlbw "Very Low Birth Weight: Birth weight < 1500 grams"
gen bw2k = PESO < 2000 if !missing(PESO)
label var bw2k "Birth weight < 2000 grams"
gen lbw = PESO < 2500 if !missing(PESO)
label var lbw "Low Birth Weight: Birth weight < 2500 grams"

gen vlbwm = PESO_MADRE < 1500 if !missing(PESO_MADRE)
label var vlbwm "Mother Very Low Birth Weight: Mother's birth weight < 1500 grams"
gen bw2km = PESO_MADRE < 2000 if !missing(PESO_MADRE)
label var bw2km "Mother's birth weight < 2000 grams"
gen lbwm = PESO_MADRE < 2500 if !missing(PESO_MADRE)
label var lbwm "Mother Low Birth Weight: Mother's birth weight < 2500 grams"



* Compress, label, sign, and save the low birth weight indicators:
compress
label data "Low Birthweight Indicators"
notes drop _dta
note: Last modified by: $id_user_full ($id_user_email)
note: Last modification timestamp: $S_DATE at $S_TIME
save "LBW_INDICATORS.dta", replace

*-------------------------------------------------------------------------------
*--- 9. Generate birth timing variables (needs MOTHER_BIRTHDATA.dta)
*-------------------------------------------------------------------------------

* Open the birth records (no duplicated or NA newborn IDs, per file name):
use "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", clear


* Birth-year range scalars:
summarize $byear_var
scalar min_ano_nac = r(min)
scalar max_ano_nac = r(max)
scalar max_mother_age = max_ano_nac - min_ano_nac


* Only identifiers, birth date, weight and gestational weeks are needed:
keep ID_RECIEN_NACIDO ID_MADRE FECHA_NACIMIENTO_SIF PESO SEMANAS

* Attach the mother's own weight, gestational weeks and birth date, then
* discard mother records that match no birth in the master data:
merge m:1 ID_MADRE using "MOTHER_BIRTHDATA.dta", ///
	gen(mrg_mbdata2main) ///
	keepusing(PESO_MADRE SEMANAS_MADRE FECHA_NACIMIENTO_SIF_MADRE)
drop if mrg_mbdata2main == 2
label variable mrg_mbdata2main "Merge m:1 ID_MADRE using MOTHER_BIRTHDATA.dta"


* Weekend births of the person: day of week (0 = Sunday ... 6 = Saturday),
* a Saturday/Sunday flag, and a Friday-to-Sunday flag:
gen byte dayofweek = dow(FECHA_NACIMIENTO_SIF)
label variable dayofweek "Birth day of the week (Sunday = 0)"

gen byte wkndbirth = inlist(dayofweek, 0, 6) if dayofweek != .
label variable wkndbirth "Birth during weekend (Saturday or Sunday)"

gen byte wkndbirth2 = inlist(dayofweek, 0, 5, 6) if dayofweek != .
label variable wkndbirth2 "Birth during weekend (Friday, Saturday, or Sunday)"

* The same three variables for the person's mother:
gen byte dayofweekm = dow(FECHA_NACIMIENTO_SIF_MADRE)
label variable dayofweekm "Mother: Birth day of the week (Sunday = 0)"

gen byte wkndbirthm = inlist(dayofweekm, 0, 6) if dayofweekm != .
label variable wkndbirthm "Mother: Birth during weekend (Saturday or Sunday)"

gen byte wkndbirth2m = inlist(dayofweekm, 0, 5, 6) if dayofweekm != .
label variable wkndbirth2m "Mother: Birth during weekend (Friday, Saturday, or Sunday)"

* Dummy for 32 or more weeks of gestation (missing when weeks are missing):
label define sem32 0 "Gestational Weeks <= 31" 1 "Gestational Weeks >= 32"
gen sem32 = SEMANAS >= 32 if SEMANAS != .
label variable sem32 "Gestational Weeks >= 32"
label values sem32 sem32

* Dummy for 32 or more weeks of gestation, for the mother's own gestation:
label define sem32m 0 "Mother's Gestational Weeks <= 31" 1 "Mother's Gestational Weeks >= 32"
gen sem32m = SEMANAS_MADRE >= 32 if SEMANAS_MADRE != .
label variable sem32m "Mother's Gestational Weeks >= 32"
label values sem32m sem32m

* Generate pre-term indicators:
* one dummy per threshold from 27 to 40 weeks. The dummy is left missing when
* SEMANAS is missing, in line with the sem32 dummy of this section; without
* the restriction an unknown gestational length would be coded as 0.
* NOTE(review): the comparison is "at or below" the threshold while the label
* says "below"; confirm which of the two is intended.
forval w = 27(1)40 {
	gen gw_below_`w' = SEMANAS <= `w' if SEMANAS != .
	label var gw_below_`w' "Child's gestational length below `w' weeks"
}


* Compress, label, sign, and save the birth timing variables:
compress
label data "Birth timing variables"
notes drop _dta
note: Last modified by: $id_user_full ($id_user_email)
note: Last modification timestamp: $S_DATE at $S_TIME
save "BIRTH_TIMING_VARIABLES.dta", replace

*-------------------------------------------------------------------------------
*--- 10. Generate mortality variables (needs pre-processed NAC and DEF data)
*-------------------------------------------------------------------------------

* Load the birth records (file without duplicated IDs and without NAs):
use "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", clear


* Span of birth years covered by the data:
summarize $byear_var
scalar min_ano_nac = r(min)
scalar max_ano_nac = r(max)
scalar max_mother_age = max_ano_nac - min_ano_nac


* Only the child identifier and the date of birth are needed here:
keep ID_RECIEN_NACIDO FECHA_NACIMIENTO_SIF

* The merge below is keyed on ID_FALLECIDO, so clone the child ID under that name:
generate ID_FALLECIDO = ID_RECIEN_NACIDO

* Attach date and year of death from the deaths file (also without duplicates
* and without NAs). Births with no death record are kept; deaths with no
* birth record are discarded.
merge 1:1 ID_FALLECIDO ///
	using "${def_original}_NOGLOSAS_NODUPS_NONAS.dta", ///
	keepusing(FECHA_DEF_SIF ANO_DEF) ///
	keep(master match) generate(mrg_DEF2NAC)
label variable mrg_DEF2NAC "Merge 1:1 ID_FALLECIDO/RECIEN_NACIDO"
drop ID_FALLECIDO
	
* Last calendar year observed in the deaths data:
summarize ANO_DEF
scalar max_ano_def = r(max)

* Age reached by 31 December of that year (end of the death dataset):
local ddata_end = mdy(12, 31, max_ano_def)
local ddata_end_txt = string(`ddata_end', "%td")
personage FECHA_NACIMIENTO_SIF, currdate(`ddata_end') g(age_end_of_ddata)
label var age_end_of_ddata `"Age by the end of the death dataset: `ddata_end_txt'"'

* Age at death, computed only for births matched to a death record.
* Negative ages (death dated before birth) are set to missing.
personage FECHA_NACIMIENTO_SIF FECHA_DEF_SIF if mrg_DEF2NAC == 3, g(age_at_death)
replace age_at_death = . if age_at_death < 0
label var age_at_death "Age at the time of death"

* One dummy per observed age at death:
*   1 = died at exactly that age;
*   0 = did not die at that age (survived it, or no death recorded);
*   . = had not reached that age by the end of the death data, or died earlier.
levelsof age_at_death, local(death_years)
foreach yr of local death_years {
	local yr2 = string(`yr', "%02.0f")
	gen byte dead_at_a`yr2' = (age_at_death == `yr') if age_end_of_ddata != . & age_end_of_ddata >= `yr'
	replace dead_at_a`yr2' = . if age_at_death < `yr'
	label var dead_at_a`yr2' "Died at age `yr'"
}

* Dummy variables for mortality by age in months:
gen days_death = FECHA_DEF_SIF - FECHA_NACIMIENTO_SIF
label var days_death "Days alive until death"
* Months are approximated as 365/12 days each.
gen months_death = floor(days_death / (365/12))
label var months_death "Months alive until death"

* Last month covered by the deaths data. It is derived from max_ano_def (the
* same scalar that drives age_end_of_ddata above) instead of a hard-coded
* 2018, so the censoring rule follows the data if the deaths file is updated.
local last_def_month = ym(scalar(max_ano_def), 12)

* One dummy per month of life during the first year:
*   1 = died in month m; 0 = did not die in month m;
*   . = born fewer than m months before the end of the deaths data, or died
*       before month m (this also covers negative months_death).
forval m = 0(1)11 {
	local mm = string(`m', "%02.0f")
	gen byte dead_at_m`mm' = months_death >= 0 & months_death == `m'
	replace dead_at_m`mm' = . if `last_def_month' - mofd(FECHA_NACIMIENTO_SIF) < `m'
	replace dead_at_m`mm' = . if months_death < `m'
	label var dead_at_m`mm' "Infant Mortality (death within `m' months after birth)"
}


* Shrink storage types, describe the dataset, stamp it with author and
* timestamp notes, and save the mortality variables:
compress
label data "Mortality variables"
notes drop _dta
notes _dta: Last modified by: ${id_user_full} (${id_user_email})
notes _dta: Last modification timestamp: ${S_DATE} at ${S_TIME}
save "MORTALITY_VARIABLES.dta", replace

*-------------------------------------------------------------------------------
*--- 11. Generate income variables (needs CASEN data)
*-------------------------------------------------------------------------------

* Load the birth records (file without duplicated IDs and without NAs):
use "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", clear


* Span of birth years covered by the data:
summarize $byear_var
scalar min_ano_nac = r(min)
scalar max_ano_nac = r(max)
scalar max_mother_age = max_ano_nac - min_ano_nac

* Years flagged as problematic for the income variable, plus the same list
* written as a comma-separated argument list for inlist():
local probyears 1992 2000 2007 2015
local probyears_csv : subinstr local probyears " " ", ", all


* Keep only the variables needed to build incomes:
keep ID_RECIEN_NACIDO ID_MADRE FECHA_NACIMIENTO_SIF ANO_NAC REGION_RESIDENCIA OCUPA_?ADRE ACTIV_?ADRE EDAD_?ADRE

* Flag births that took place in a problematic year:
generate byte problematic_year = inlist(ANO_NAC, `probyears_csv')
label variable problematic_year "Issues with the income variable for this year"

* Order births chronologically within mother:
sort ID_MADRE FECHA_NACIMIENTO_SIF

* Birth order within mother, skipped when the mother ID is "NA":
*   _u uses rank(..., unique): births on the same date get distinct ranks;
*   _t uses rank(..., track):  births on the same date share the lowest rank.
foreach ties in unique track {
	local sfx = substr("`ties'", 1, 1)
	by ID_MADRE: egen byte birth_order_by_mother_`sfx' = rank(FECHA_NACIMIENTO_SIF) if ID_MADRE != "NA", `ties'
}

* Number of children born so far: the highest unique rank among the births
* that share the same tracked rank.
bysort ID_MADRE birth_order_by_mother_t: egen byte nchilds_by_mother_t = max(birth_order_by_mother_u) if ID_MADRE != "NA"
label variable nchilds_by_mother_t "Number of children born to the same mother so far"


* Five-year age bands for each parent. cut(..., icodes) numbers the bands
* 0-5, so 1 is added to line them up with the value label below; ages outside
* [20, 50) are left missing.
foreach parent in PADRE MADRE {
	egen agerange_`parent' = cut(EDAD_`parent'), at(20(5)50) icodes
	replace agerange_`parent' = agerange_`parent' + 1
}

label define agerange 1 "20-24" 2 "25-29" 3 "30-34" 4 "35-39" 5 "40-44" 6 "45-49", replace
label values agerange_?ADRE agerange

* Merge in incomes adjusted by age range from CASEN.
* The variable list is recorded before the merges so that the variables they
* add can be identified afterwards. Unmatched births are kept; CASEN cells
* with no matching birth are dropped.
quietly ds
local vars_before `r(varlist)'
merge m:1 ANO_NAC REGION_RESIDENCIA OCUPA_PADRE agerange_PADRE ///
	using "$dtadir/CASEN/ipolatefather_tramo_renamed.dta", ///
	keep(1 3) nogen keepusing(*PADRE)

merge m:1 ANO_NAC REGION_RESIDENCIA OCUPA_MADRE agerange_MADRE ///
	using "$dtadir/CASEN/ipolatemother_tramo_renamed.dta", ///
	keep(1 3) nogen keepusing(*_MADRE)
ds `vars_before', not
local byage_income_variables `r(varlist)'

* Merge in incomes not adjusted by age range from CASEN (same procedure,
* without the age band in the merge key):
quietly ds
local vars_before `r(varlist)'
merge m:1 ANO_NAC REGION_RESIDENCIA OCUPA_PADRE ///
	using "$dtadir/CASEN/ipolatefather_renamed.dta", ///
	keep(1 3) nogen keepusing(*_PADRE)

merge m:1 ANO_NAC REGION_RESIDENCIA OCUPA_MADRE ///
	using "$dtadir/CASEN/ipolatemother_renamed.dta", ///
	keep(1 3) nogen keepusing(*_MADRE)
ds `vars_before', not
local notbyage_income_variables `r(varlist)'

* Set merged incomes to missing unless the parent's ACTIV code is "1".
* The fifth character from the end of each variable name (the P of PADRE or
* the M of MADRE) selects the matching ACTIV_?ADRE variable.
foreach incvar of varlist `byage_income_variables' `notbyage_income_variables' {
	local initial = substr("`incvar'", -5, 1)
	replace `incvar' = . if ACTIV_`initial'ADRE != "1"
}



* Household income aggregates, built once for each interpolated income measure:
*   ryi  = real income, not adjusted for age;
*   ryai = real income, adjusted for age.
* Every aggregate is left missing for births in problematic years.
* The per-capita measures refer to nchilds_by_mother_t by its full name, so
* they no longer depend on Stata's variable-abbreviation setting.
foreach inc in ryi ryai {
	* Label fragment that distinguishes the two measures:
	if "`inc'" == "ryai" local adj "age-adjusted "
	else local adj ""

	* Total household real income (missing when neither parent has income):
	egen tothh`inc' = rowtotal(`inc'_?ADRE) if problematic_year == 0, missing
	label var tothh`inc' "Total household real `adj'income (int.)"

	* Number of parents earning income:
	egen byte numearners_`inc' = rownonmiss(`inc'_?ADRE) if problematic_year == 0
	label var numearners_`inc' "Number of real `adj'income earning parents (int.)"

	* Total household real income per capita (earners plus children so far):
	gen tothh`inc'pc = tothh`inc' / (numearners_`inc' + nchilds_by_mother_t) if ID_MADRE != "NA" & ANO_NAC >= 2001 & problematic_year == 0
	label var tothh`inc'pc "Total household real `adj'income per capita (int.)"
	note tothh`inc'pc: not available for years prior to 2001, when ID_MADRE is not available.
	note tothh`inc'pc: caution: might not be accurate for births in the early 2000s because first child could have been born prior to 2001, and no ID_MADRE is available.

	* Total household real income divided by earners and 1 child:
	gen tothh`inc'pc3 = tothh`inc' / (numearners_`inc' + 1) if problematic_year == 0
	label var tothh`inc'pc3 "Total household real `adj'income per earners and 1 child (int.)"
	note tothh`inc'pc3: does not rely on availability of ID_MADRE, so is appropriate for years `=min_ano_nac' - `=max_ano_nac'

	* Average household real income across the parents with income:
	egen avghh`inc' = rowmean(`inc'_?ADRE) if problematic_year == 0
	label var avghh`inc' "Average household real `adj'income (int.)"
}


* Terciles, quartiles and quintiles of every household income aggregate,
* computed within year of birth and only for non-problematic years.
* Each income variable is timed separately, starting at timer 2.
local v = 2
foreach var of varlist ???hhry*i* {
	timer on `v'
	local vlbl : variable label `var'

	* Year-specific percentiles (defined only where the income is observed):
	foreach p of numlist 20 25 33 40 50 60 66 75 80 {
		bys ANO_NAC: egen p`p'_`var' = pctile(`var') if problematic_year == 0 & `var' != ., p(`p')
	}

	* Rows eligible for classification. The non-missing check matters for the
	* lowest group: where the income is missing the percentile is missing too,
	* and Stata evaluates "missing <= missing" as true, which would otherwise
	* place every missing income in group 1.
	local valid `var' != . & problematic_year == 0

	* Define terciles:
	gen byte tercile_`var' = 1 if `var' <= p33_`var' & `valid'
	replace tercile_`var' = 2 if `var' > p33_`var' & `var' <= p66_`var' & `valid'
	replace tercile_`var' = 3 if `var' > p66_`var' & `valid'
	label var tercile_`var' "Tercile of `vlbl'"

	* Define quartiles:
	gen byte quartile_`var' = 1 if `var' <= p25_`var' & `valid'
	replace quartile_`var' = 2 if `var' > p25_`var' & `var' <= p50_`var' & `valid'
	replace quartile_`var' = 3 if `var' > p50_`var' & `var' <= p75_`var' & `valid'
	replace quartile_`var' = 4 if `var' > p75_`var' & `valid'
	label var quartile_`var' "Quartile of `vlbl'"

	* Define quintiles:
	gen byte quintile_`var' = 1 if `var' <= p20_`var' & `valid'
	replace quintile_`var' = 2 if `var' > p20_`var' & `var' <= p40_`var' & `valid'
	replace quintile_`var' = 3 if `var' > p40_`var' & `var' <= p60_`var' & `valid'
	replace quintile_`var' = 4 if `var' > p60_`var' & `var' <= p80_`var' & `valid'
	replace quintile_`var' = 5 if `var' > p80_`var' & `valid'
	label var quintile_`var' "Quintile of `vlbl'"

	timer off `v'
	local v = `v' + 1

	* Drop the helper percentile variables:
	drop p??_`var'
}


* Shrink storage types, describe the dataset, stamp it with author and
* timestamp notes, and save the income variables:
compress
label data "Income variables"
notes drop _dta
notes _dta: Last modified by: ${id_user_full} (${id_user_email})
notes _dta: Last modification timestamp: ${S_DATE} at ${S_TIME}
save "INCOME_VARIABLES.dta", replace

*-------------------------------------------------------------------------------
*--- 12. Generate mother's income variables (needs INCOME_VARIABLES.dta and ID_RECIEN_NACIDO_of_unique_ID_MADRE.dta)
*-------------------------------------------------------------------------------

* Start from the income variables built in the previous section:
use "INCOME_VARIABLES.dta", clear

* Keep the child ID, every parent-specific variable and the household
* income aggregates (with their tercile/quartile/quintile versions):
keep ID_RECIEN_NACIDO *_PADRE *_MADRE *hhryi* *hhryai*

* Keep only the births of children who later appear as mothers:
merge 1:1 ID_RECIEN_NACIDO using "ID_RECIEN_NACIDO_of_unique_ID_MADRE.dta", keep(match) nogen

* Shift every parent variable one generation up, fathers first and mothers
* second: *_PADRE becomes *_ABUELO and *_MADRE becomes *_ABUELA, with the
* label prefixed accordingly.
foreach role in PADRE MADRE {
	if "`role'" == "PADRE" {
		local newrole ABUELO
		local prefix Abuelo
	}
	else {
		local newrole ABUELA
		local prefix Abuela
	}
	foreach oldname of varlist *_`role' {
		local oldlabel : variable label `oldname'
		local newname : subinstr local oldname "_`role'" "_`newrole'", all
		rename `oldname' `newname'
		label variable `newname' "`prefix': `oldlabel'"
	}
}

* Household aggregates get an "m" suffix and a label prefix that marks them
* as belonging to the mother's household:
foreach hhvar of varlist *hhryi* *hhryai* {
	local oldlabel : variable label `hhvar'
	rename `hhvar' `hhvar'm
	label variable `hhvar'm "Mother's h'hold: `oldlabel'"
}

* The child ID of these records is the mother ID of the next generation.
* (The original ID_MADRE was already renamed by the *_MADRE loop above.)
rename ID_RECIEN_NACIDO ID_MADRE
label variable ID_MADRE "Identificador único y anónimo de la madre del recién nacido vivo"

* Shrink storage types, describe the dataset, stamp it with author and
* timestamp notes, and save the mother income data:
compress
label data "Mother income data"
notes drop _dta
notes _dta: Last modified by: ${id_user_full} (${id_user_email})
notes _dta: Last modification timestamp: ${S_DATE} at ${S_TIME}
save "MOTHER_INCOME_DATA.dta", replace

*-------------------------------------------------------------------------------
*--- 13. Generate mother's ISAPRE status before and after each birth
*-------------------------------------------------------------------------------

* Load the birth records (file without duplicated IDs and without NAs):
use "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", clear


* Span of birth years covered by the data:
summarize $byear_var
scalar min_ano_nac = r(min)
scalar max_ano_nac = r(max)
scalar max_mother_age = max_ano_nac - min_ano_nac



* Discard births whose mother is unknown:
drop if inlist(ID_MADRE, "NA", "")

* One row per mother and distinct birth date:
keep ID_MADRE FECHA_NACIMIENTO_SIF
duplicates drop

* Chronological order of each birth date within mother:
bysort ID_MADRE: egen birth_order = rank(FECHA_NACIMIENTO_SIF), track

* One row per mother, with one birth-date column per birth order:
reshape wide FECHA_NACIMIENTO_SIF , i(ID_MADRE) j(birth_order)

* The merge below is keyed on ID_PACIENTE, so clone the mother ID under that name:
generate ID_PACIENTE = ID_MADRE

* Attach every EEHH record of each mother; mothers with no record and
* records of other patients are dropped.
merge 1:m ID_PACIENTE using EEHH_2001_2019_NODUPS_NONAS.dta, ///
	keepusing(PERTENENCIA_SNSS PREVISION FECHA_EGRESO DIAS_ESTADA) ///
	keep(match) nogen

* Admission date = discharge date minus length of stay:
generate fecha_ingreso = FECHA_EGRESO - DIAS_ESTADA
format fecha_ingreso %td
format fecha_ingreso %td

* Classify hospital visits into periods:
*   0  = admitted on or before the first birth;
*   k  = admitted after birth k and on or before birth k+1 (k = 1..9);
*   10 = admitted after birth 10.
* A missing next-birth date compares as larger than any date, so visits after
* a mother's last recorded birth fall in that birth's period.
* NOTE(review): relies on FECHA_NACIMIENTO_SIF1-FECHA_NACIMIENTO_SIF10 all
* existing after the reshape - confirm against the data.
sort ID_PACIENTE FECHA_EGRESO
generate period = 0 if fecha_ingreso <= FECHA_NACIMIENTO_SIF1
forvalues k = 1/9 {
	local next = `k' + 1
	replace period = `k' if fecha_ingreso > FECHA_NACIMIENTO_SIF`k' & fecha_ingreso <= FECHA_NACIMIENTO_SIF`next'
}
replace period = 10 if fecha_ingreso > FECHA_NACIMIENTO_SIF10

* Visit covered by ISAPRE: 1 when PREVISION is 2, 0 for the other codes up to
* 2 and for code 96, missing otherwise.
generate isapre = (PREVISION == 2) if PREVISION <= 2 | PREVISION == 96

* Visit to a private clinic: 1 when PERTENENCIA_SNSS is 2, 0 otherwise.
generate private = (PERTENENCIA_SNSS == 2)

* Sort by patient, period, and admission date:
sort ID_PACIENTE period fecha_ingreso

* For each patient and period: the share of visits with the flag set, and
* whether at least one visit had it.
foreach kind in isapre private {
	by ID_PACIENTE period: egen share_`kind' = mean(`kind')
	by ID_PACIENTE period: egen any_`kind' = max(`kind')
}

* Keep only relevant variables:
keep ID_MADRE FECHA_NACIMIENTO_SIF* period share_* any_*

* Keep one observation per ID/period combination:
duplicates drop
compress

* One row per mother, with one set of share/any columns per period:
reshape wide share_* any_*, i(ID_MADRE) j(period)

* Back to one row per mother and birth order:
reshape long FECHA_NACIMIENTO_SIF, i(ID_MADRE) j(birth_order)

* Drop birth orders a mother never reached:
keep if FECHA_NACIMIENTO_SIF != .

* Place the date and the new variables right after the mother ID:
order FECHA_NACIMIENTO_SIF share_isapre* any_isapre* share_private* any_private*, after(ID_MADRE)

* Label newly created variables.
* The numeric suffix of each variable is the period: period n covers visits
* after birth n and before birth n+1.
* The any_private* variables are labeled through their own loop, so the
* private-clinic text no longer overwrites the any_isapre* labels.
foreach kind in isapre private {
	if "`kind'" == "isapre" {
		local share_txt "Share of hospital visits covered by ISAPRE"
		local any_txt "At least one hospital visit covered by ISAPRE"
	}
	else {
		local share_txt "Share of hospital visits to private clinics"
		local any_txt "At least one hospital visit to a private clinic"
	}

	foreach stat in share any {
		foreach var of varlist `stat'_`kind'* {
			local n = real(subinstr("`var'", "`stat'_`kind'", "", 1))
			label var `var' "``stat'_txt' after birth `n' and before birth `=`n'+1'"
		}
	}
}

* Shrink storage types, describe the dataset, stamp it with author and
* timestamp notes, and save the mothers' ISAPRE / private clinic status:
compress
label data "Mother ISAPRE and private clinic status by birth"
notes drop _dta
notes _dta: Last modified by: ${id_user_full} (${id_user_email})
notes _dta: Last modification timestamp: ${S_DATE} at ${S_TIME}
save "ISAPRECLINIC_MOTHERS.dta", replace

*-------------------------------------------------------------------------------
*--- 14. INCOME TRANSITION VARIABLES 
*-------------------------------------------------------------------------------

* Load the birth records (file without duplicated IDs and without NAs):
use "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", clear


* Span of birth years covered by the data:
summarize $byear_var
scalar min_ano_nac = r(min)
scalar max_ano_nac = r(max)
scalar max_mother_age = max_ano_nac - min_ano_nac

* Years flagged as problematic for the income variable, plus the same list
* written as a comma-separated argument list for inlist():
local probyears 1992 2000 2007 2015
local probyears_csv : subinstr local probyears " " ", ", all


* Keep only relevant variables:
keep ID_RECIEN_NACIDO ID_MADRE FECHA_NACIMIENTO_SIF ANO_NAC ACTIV_?ADRE

* Flag births that took place in a problematic year:
generate byte problematic_year = inlist(ANO_NAC, `probyears_csv')
label variable problematic_year "Issues with the income variable for this year"

* Order births chronologically within mother:
sort ID_MADRE FECHA_NACIMIENTO_SIF

* Birth order within mother, skipped when the mother ID is "NA":
*   _u uses rank(..., unique): births on the same date get distinct ranks;
*   _t uses rank(..., track):  births on the same date share the lowest rank.
foreach ties in unique track {
	local sfx = substr("`ties'", 1, 1)
	by ID_MADRE: egen byte birth_order_by_mother_`sfx' = rank(FECHA_NACIMIENTO_SIF) if ID_MADRE != "NA", `ties'
}

* Number of children born so far: the highest unique rank among the births
* that share the same tracked rank.
bysort ID_MADRE birth_order_by_mother_t: egen byte nchilds_by_mother_t = max(birth_order_by_mother_u) if ID_MADRE != "NA"
label variable nchilds_by_mother_t "Number of children born to the same mother so far"


* Bring in the income variables, then drop the merged-in variables that are
* not incomes:
merge 1:1 ID_RECIEN_NACIDO using "INCOME_VARIABLES.dta", nogen keepusing(*_PADRE *_MADRE *hhry* num*)
drop EDAD_* OCUPA_* agerange_*

* Variables to track across births:
ds ryai_?ADRE ???hhryai* hi_?ADRE shrfti_?ADRE
local income_vlist `r(varlist)'

* For each of a mother's first five birth orders, copy the value observed at
* that birth onto all of her rows (VAR_b01, VAR_b02, ...):
sort ID_MADRE FECHA_NACIMIENTO_SIF
levelsof birth_order_by_mother_t if birth_order_by_mother_t <= 5, local(blist)
foreach ord of local blist {
	local ord2 = string(`ord', "%02.0f")

	foreach inc of varlist `income_vlist' {
		local inclbl : variable label `inc'

		* min() over the mother's rows, looking only at rows of birth order `ord':
		by ID_MADRE: egen `inc'_b`ord2' = ///
			min(cond(birth_order_by_mother_t == `ord', `inc', .))

		label var `inc'_b`ord2' "`inclbl': birth `ord'"
	}
}

* Empty placeholders for the values observed at the NEXT birth:
foreach inc of varlist `income_vlist' {
	local inclbl : variable label `inc'

	gen next_`inc' = .
	label var next_`inc' "`inclbl': next birth"
}

* At birth order b, the next-birth value is the one stored for birth b + 1.
* Orders with no _b variable for b + 1 are skipped.
levelsof birth_order_by_mother_t if birth_order_by_mother_t <= 5, local(blist)
foreach ord of local blist {
	local nxt2 = string(`ord' + 1, "%02.0f")

	foreach inc of varlist `income_vlist' {
		capture confirm variable `inc'_b`nxt2', exact
		if !_rc {
			replace next_`inc' = `inc'_b`nxt2' if birth_order_by_mother_t == `ord'
		}
	}
}

* Shrink storage types, describe the dataset, stamp it with author and
* timestamp notes, and save the income transition variables:
compress
label data "Income transition variables"
notes drop _dta
notes _dta: Last modified by: ${id_user_full} (${id_user_email})
notes _dta: Last modification timestamp: ${S_DATE} at ${S_TIME}
save "INCOME_TRANSITION_VARIABLES.dta", replace

*-------------------------------------------------------------------------------
*--- 15. BLN 2013 VARIABLES
*-------------------------------------------------------------------------------

* Load the birth records (file without duplicated IDs and without NAs):
use "${nac_original}_NOGLOSAS_NODUPS_NONAS.dta", clear


* Span of birth years covered by the data:
summarize $byear_var
scalar min_ano_nac = r(min)
scalar max_ano_nac = r(max)
scalar max_mother_age = max_ano_nac - min_ano_nac


* Keep only relevant variables:
keep ID_RECIEN_NACIDO ID_MADRE ///
	EST_CIV_MADRE TIPO_ATEN REGION_RESIDENCIA ANO_NAC EDAD_MADRE

* Attach the same characteristics as recorded at the mother's own birth:
merge m:1 ID_MADRE using "MOTHER_BIRTHDATA.dta", ///
	keepusing(EST_CIV_ABUELA TIPO_ATEN_MADRE REGION_RESIDENCIA_ABUELA ///
	ANO_NAC_MADRE EDAD_ABUELA) ///
	generate(mrg_mbdata2main)

* Drop records found only in MOTHER_BIRTHDATA.dta (no birth in the main file):
drop if mrg_mbdata2main == 2

* Label the merge indicator:
label variable mrg_mbdata2main "Merge m:1 ID_MADRE using MOTHER_BIRTHDATA.dta"

* Attach the education variables edmom and edgmom:
merge 1:1 ID_RECIEN_NACIDO using EDUCATION_VARIABLES.dta, ///
	keepusing(edmom edgmom) nogen

* Married status:
* 1 when the civil status code is 2 or 6, 0 for any other known code.
* Code 9 and missing values are both left missing: without the explicit
* missing check an unknown civil status would be coded 0, i.e. counted as
* "Single/Widow/Divorced". This mirrors the handling of TIPO_ATEN below.
* Mother:
gen byte married = (EST_CIV_MADRE == 2 | EST_CIV_MADRE == 6) if EST_CIV_MADRE != 9 & !missing(EST_CIV_MADRE)
label var married "Mother's married status at time of birth"

* Grandmother:
gen byte marriedm = (EST_CIV_ABUELA == 2 | EST_CIV_ABUELA == 6) if EST_CIV_ABUELA != 9 & !missing(EST_CIV_ABUELA)
label var marriedm "Grandmother's married status at time of birth of the mother" 

label def married 1 "Married/Civil Union" 0 "Single/Widow/Divorced"
label val married marriedm married

* Type of attending:
* 1 when the attention code is 1, 0 for any other known code; code 9 and
* missing values are left missing.
* Mother:
gen byte doc_aten = TIPO_ATEN == 1 if TIPO_ATEN != . & TIPO_ATEN != 9
label var doc_aten "Type of medical attention"

* Grandmother:
gen byte doc_atenm = TIPO_ATEN_MADRE == 1 if TIPO_ATEN_MADRE != . & TIPO_ATEN_MADRE != 9
label var doc_atenm "Mother's type of medical attention"

label def doc_aten 1 "Doctor" 0 "Midwife/None/Other"
label val doc_aten doc_atenm doc_aten

* Region:
* One dummy per observed region code, missing when the region is missing.
* The region name is read from whichever value label is attached to the
* variable, using the (varname) form, instead of assuming that a value label
* named exactly like the variable exists. With -strict-, the name is left
* empty when the code has no label.
* Mother:
levelsof REGION_RESIDENCIA, local(reg_list)
foreach r of local reg_list {
	local rr = string(`r', "%02.0f")
	local vlbl : label (REGION_RESIDENCIA) `r', strict
	gen byte bregion_`rr' = REGION_RESIDENCIA == `r' if REGION_RESIDENCIA != .
	label var bregion_`rr' "Birth region: `vlbl'"
}

* Grandmother:
levelsof REGION_RESIDENCIA_ABUELA, local(reg_listm)
foreach r of local reg_listm {
	local rr = string(`r', "%02.0f")
	local vlbl : label (REGION_RESIDENCIA_ABUELA) `r', strict
	gen byte bregion_`rr'm = REGION_RESIDENCIA_ABUELA == `r' if REGION_RESIDENCIA_ABUELA != .
	label var bregion_`rr'm "Mother's birth region: `vlbl'"
}

* Year of birth:
* One dummy per observed year, missing when the year is missing.
* Mother:
levelsof ANO_NAC, local(year_list)
foreach yyyy of local year_list {
	gen byte byear_`yyyy' = ANO_NAC == `yyyy' if ANO_NAC != .
	label var byear_`yyyy' "Birth year: `yyyy'"
}

* Grandmother:
levelsof ANO_NAC_MADRE, local(year_listm)
foreach yyyy of local year_listm {
	gen byte byear_`yyyy'm = ANO_NAC_MADRE == `yyyy' if ANO_NAC_MADRE != .
	label var byear_`yyyy'm "Mother's birth year: `yyyy'"
}


* Control sets of each generation (wildcards are expanded where they are used):
local g1vars edmom EDAD_MADRE married doc_aten bregion_?? byear_????
local g2vars edgmom EDAD_ABUELA marriedm doc_atenm bregion_??m byear_????m

* Number of missing controls per observation:
egen nmissg1_blnctrls = rowmiss(`g1vars')
egen nmissg2_blnctrls = rowmiss(`g2vars')

* Flags for complete controls. The second-generation flag is defined only for
* births matched to MOTHER_BIRTHDATA.dta.
gen g1ok_blnctrls = nmissg1_blnctrls == 0
gen g2ok_blnctrls = nmissg2_blnctrls == 0 if mrg_mbdata2main == 3

label var g1ok_blnctrls "All BLN (2013) controls non-missing (1st generation)"
label var g2ok_blnctrls "All BLN (2013) controls non-missing (2nd generation)"

* Keep the child ID, the two flags and the controls themselves:
keep ID_RECIEN_NACIDO g1ok_blnctrls g2ok_blnctrls ///
	`g1vars' ///
	`g2vars'

* Shrink storage types, describe the dataset, stamp it with author and
* timestamp notes, and save the BLN (2013) control variables:
compress
label data "Control variables according to Bharadwadj, Loken, and Neilson (2013)"
notes drop _dta
notes _dta: Last modified by: ${id_user_full} (${id_user_email})
notes _dta: Last modification timestamp: ${S_DATE} at ${S_TIME}
save "BLN2013_CONTROLS.dta", replace



*-----------------
* 16. Closing
*-----------------
* Stop timer 1 (started at the top of the file) and print all timers.
timer off 1
timer list 
